{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "69aa518f-4870-499d-9689-2a0411f77fd8",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "%pip install ludwig==0.10.0 ludwig[llm] torch==2.1.2 PyYAML==6.0 datasets==2.18.0 pandas==2.1.4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "1fb03a95-3298-454c-b0dd-7346dffea3af",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "import yaml\n",
    "import logging\n",
    "import torch\n",
    "import datasets\n",
    "import pandas as pd\n",
    "from ludwig.api import LudwigModel"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "a561cc98-c3ce-48ab-a20b-6cbd14589bbb",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "View Dataset: https://github.com/yizhongw/self-instruct (the next cell loads the `tatsu-lab/alpaca` dataset from the Hugging Face Hub)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "be6e10b8-7313-46c2-b0a7-3c7279de7cb3",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "# Fetch the `tatsu-lab/alpaca` dataset.\n",
    "data = datasets.load_dataset(\"tatsu-lab/alpaca\")\n",
    "\n",
    "# Turn the train split into a DataFrame restricted to the three columns\n",
    "# listed here, then preview the first rows.\n",
    "df = pd.DataFrame(data[\"train\"])[[\"instruction\", \"input\", \"output\"]]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "3cd59017-f182-44f0-8af8-45cebab560cb",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Read the Hugging Face token from the environment instead of hard-coding it:\n",
    "# a literal token in a notebook is exposed to everyone who can read the file.\n",
    "# Provide it through HUGGING_FACE_HUB_TOKEN (or HF_TOKEN) before running this\n",
    "# cell, e.g. as a cluster environment variable.\n",
    "hugging_face_token = os.environ.get(\"HUGGING_FACE_HUB_TOKEN\") or os.environ.get(\"HF_TOKEN\")\n",
    "if not hugging_face_token:\n",
    "    # Fail fast with an actionable message rather than deep inside model loading.\n",
    "    raise RuntimeError(\n",
    "        \"No Hugging Face token found. Set the HUGGING_FACE_HUB_TOKEN (or HF_TOKEN) \"\n",
    "        \"environment variable before running this cell.\"\n",
    "    )\n",
    "\n",
    "# Export under the variable name this notebook originally set.\n",
    "os.environ[\"HUGGING_FACE_HUB_TOKEN\"] = hugging_face_token\n",
    "\n",
    "# Fine-tuning recipe written as YAML: a LoRA adapter combined with 4-bit\n",
    "# quantization of the base model. The text is bound to a name first and then\n",
    "# parsed into the config dict.\n",
    "_qlora_config_yaml = \"\"\"\n",
    "model_type: llm\n",
    "base_model: mistralai/Mistral-7B-Instruct-v0.2\n",
    "\n",
    "input_features:\n",
    "  - name: instruction\n",
    "    type: text\n",
    "\n",
    "output_features:\n",
    "  - name: output\n",
    "    type: text\n",
    "\n",
    "prompt:\n",
    "  template: >-\n",
    "    Below is an instruction that describes a task, paired with an input\n",
    "    that provides further context. Write a response that appropriately\n",
    "    completes the request.\n",
    "\n",
    "    ### Instruction: {instruction}\n",
    "\n",
    "    ### Input: {input}\n",
    "\n",
    "    ### Response:\n",
    "\n",
    "generation:\n",
    "  temperature: 0.1\n",
    "  max_new_tokens: 64\n",
    "\n",
    "adapter:\n",
    "  type: lora\n",
    "\n",
    "quantization:\n",
    "  bits: 4\n",
    "\n",
    "preprocessing:\n",
    "  global_max_sequence_length: 512\n",
    "  split:\n",
    "    type: random\n",
    "    probabilities:\n",
    "    - 0.95\n",
    "    - 0\n",
    "    - 0.05\n",
    "\n",
    "trainer:\n",
    "  type: finetune\n",
    "  epochs: 1 # Typically, you want to set this to 3 epochs for instruction fine-tuning\n",
    "  batch_size: 1\n",
    "  eval_batch_size: 2\n",
    "  optimizer:\n",
    "    type: paged_adam\n",
    "  gradient_accumulation_steps: 16\n",
    "  learning_rate: 0.0004\n",
    "  learning_rate_scheduler:\n",
    "    decay: cosine\n",
    "    warmup_fraction: 0.03\n",
    "\"\"\"\n",
    "\n",
    "qlora_fine_tuning_config = yaml.safe_load(_qlora_config_yaml)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "363670ff-f428-4eea-af7b-ebc878085971",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "# Build the Ludwig model from the parsed config, logging at INFO level.\n",
    "model = LudwigModel(\n",
    "    config=qlora_fine_tuning_config,\n",
    "    logging_level=logging.INFO,\n",
    ")\n",
    "\n",
    "# Train on the first 5000 rows of the DataFrame only.\n",
    "results = model.train(dataset=df.iloc[:5000])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "641ae903-400f-4d3c-b870-efbf04859612",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "test_examples = pd.DataFrame([\n",
    "    {\n",
    "        \"instruction\": \"Name two famous authors from the 18th century.\",\n",
    "        \"input\": \"\",\n",
    "    },\n",
    "    {\n",
    "        \"instruction\": \"Develop a list of possible outcomes of the given scenario\",\n",
    "        \"input\": \"A fire has broken out in an old abandoned factory.\",\n",
    "    },\n",
    "    {\n",
    "        \"instruction\": \"Tell me what you know about mountain ranges.\",\n",
    "        \"input\": \"\",\n",
    "    },\n",
    "    {\n",
    "        \"instruction\": \"Compose a haiku describing the summer.\",\n",
    "        \"input\": \"\",\n",
    "    },\n",
    "    {\n",
    "        \"instruction\": \"Analyze the given legal document and explain the key points.\",\n",
    "        \"input\": 'The following is an excerpt from a contract between two parties, labeled \"Company A\" and \"Company B\": \\n\\n\"Company A agrees to provide reasonable assistance to Company B in ensuring the accuracy of the financial statements it provides. This includes allowing Company A reasonable access to personnel and other documents which may be necessary for Company B’s review. Company B agrees to maintain the document provided by Company A in confidence, and will not disclose the information to any third parties without Company A’s explicit permission.',\n",
    "    },\n",
    "])\n",
    "\n",
    "# Run generation on the example prompts and keep the first item of what\n",
    "# predict() returns.\n",
    "predictions = model.predict(\n",
    "    test_examples,\n",
    "    generation_config={\"max_new_tokens\": 64, \"temperature\": 0.1},\n",
    ")[0]\n",
    "\n",
    "# Print every prompt next to the first entry of its generated response.\n",
    "for instruction, input_text, response in zip(\n",
    "    test_examples['instruction'], test_examples['input'], predictions['output_response']\n",
    "):\n",
    "    print(f\"Instruction: {instruction}\")\n",
    "    print(f\"Input: {input_text}\")\n",
    "    print(f\"Generated Output: {response[0]}\")\n",
    "    print(\"\\n\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "bd7bdbaf-f98c-40e3-83f7-15c3921a7150",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "%sh\n",
    "# Replace the placeholder with your own token, keeping it inside the quotes.\n",
    "# (The stray trailing quote that left the string unterminated is removed.)\n",
    "huggingface-cli login --token \"<your-huggingface-token>\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 0,
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {
      "byteLimit": 2048000,
      "rowLimit": 10000
     },
     "inputWidgets": {},
     "nuid": "6ecc7ae9-63d3-49df-b845-a1fa41cf406c",
     "showTitle": false,
     "title": ""
    }
   },
   "outputs": [],
   "source": [
    "%sh\n",
    "# Replace both placeholders, keeping each value inside its own pair of quotes.\n",
    "# (The doubled quotes that merged --model_path into the --repo_id value are removed.)\n",
    "ludwig upload hf_hub --repo_id \"<hugging-face_repo-id>\" --model_path \"<your fine-tuned-model-path>\""
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [],
   "language": "python",
   "notebookMetadata": {
    "mostRecentlyExecutedCommandWithImplicitDF": {
     "commandId": 2824690123562440,
     "dataframes": [
      "_sqldf"
     ]
    },
    "pythonIndentUnit": 2
   },
   "notebookName": "Ludwig Fine Tuning Final - No Output",
   "widgets": {}
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
